# Reading in Data and Saving Out
# dat <- read.csv("/Users/sofiapozsonyiova/Downloads/PubR2019-race no county csv/PubR2019-race no county.csv")
# save(dat, file ="/Users/sofiapozsonyiova/Downloads/FullDataSet.RData")
# save(dat_sub,file = "/Users/sofiapozsonyiova/Downloads/SubsetDat.RData")
# Restore the saved workspace image; it is expected to supply `dat`
# (see the commented-out save() call above).
# The working directory is switched first so the relative file name resolves.
setwd("/Users/sofiapozsonyiova/Downloads/")
load(file = "FullDataSet.RData")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DataExplorer)
library(ggplot2)
# Build the analysis subset from `dat`:
#   1. keep the listed survey items and derived indicators, in this order;
#   2. recode every retained column as a factor;
#   3. add a descriptively named copy of P28 as the outcome column.
# NOTE(review): the range `P74L1a:P71d` has endpoints that look out of
# numeric order -- confirm against the column order in `dat`.
dat_sub <- dplyr::select(
  dat,
  P1, P2, Biosex, P5, P7, P8,
  P11:P14a, P14ab1:P14ab14, P15:P17c, P18b:P18d,
  P20a:P22a, P22e, P23c, P23e,
  P25:P26h, P27b, P27c, P27e:P29f,
  P30:P39a, P40a:P40d, P41b:P41e, P41g:P42d,
  P45, P47a:P47e, P49d, P49a, P49c:P50d,
  P52a:P52c, P54a:P56c, P58a:P58c,
  P59:P67, P69a:P70a, P70c:P71aL1, P74L1a:P71d,
  P74, P76, P80c:P80g, P80i:P81,
  P86:P87e, P94, P95, P97a:P99g,
  Overweight, FiveFV, Binge, Region
)

# Every selected column is treated as categorical from here on.
col_names <- names(dat_sub)
dat_sub[col_names] <- lapply(dat_sub[col_names], factor)

# P28 (presumably picked up by the P27e:P29f range -- verify) has already
# been converted above; it is duplicated under a readable name.
dat_sub_named <- dplyr::mutate(dat_sub, SelfPerceivedHealth = as.factor(P28))
# Exploratory overview of the subset via DataExplorer.
# Summary table: row/column counts, column types, and missingness totals
# (recorded output from a previous run is kept below).
DataExplorer::introduce(dat_sub_named)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 170128 188 188 0 0
## total_missing_values complete_rows total_observations memory_usage
## 1 7942492 0 31984064 128090896
# Missing-value profile per column, followed by bar charts of the
# discrete (factor) columns.
DataExplorer::plot_missing(dat_sub_named)
DataExplorer::plot_bar(dat_sub_named)
# Note: Outcome of interest
# Self-perceived health (P28, copied to SelfPerceivedHealth), coded as:
# 1 = Excellent, 2 = Very good, 3 = Good, 4 = Fair, 5 = Poor